{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import urllib.parse\n",
    "\n",
    "import requests\n",
    "from lxml import html\n",
    "\n",
    "class MyCrawler:\n",
    "    \"\"\"Minimal regex-based crawler: download a page, extract matches, append them to a file.\n",
    "\n",
    "    Attributes:\n",
    "        filename: Path of the text file that ``save`` appends to.\n",
    "        headers: HTTP headers sent with every request (a browser User-Agent by default).\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, filename):\n",
    "        self.filename = filename\n",
    "        self.headers =  {\n",
    "            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',\n",
    "        }\n",
    "\n",
    "    def download(self, url, timeout=10):\n",
    "        \"\"\"Fetch ``url`` with the crawler's headers and return the decoded body text.\n",
    "\n",
    "        Args:\n",
    "            url: Address to request.\n",
    "            timeout: Seconds to wait for the server before ``requests`` raises a\n",
    "                timeout error; without it a stalled connection blocks forever.\n",
    "        \"\"\"\n",
    "        r = requests.get(url, headers=self.headers, timeout=timeout)\n",
    "        return r.text\n",
    "\n",
    "    def extract(self, content, pattern):\n",
    "        \"\"\"Return every match of ``pattern`` in ``content`` as given by ``re.findall``.\"\"\"\n",
    "        return re.findall(pattern, content)\n",
    "\n",
    "    def save(self, info):\n",
    "        \"\"\"Append one line per item of ``info`` to ``self.filename`` (UTF-8).\n",
    "\n",
    "        ``re.findall`` yields plain strings for patterns with zero or one group\n",
    "        and tuples for patterns with several groups. Tuples are joined with\n",
    "        ``|||``; strings are written unchanged, because joining a string would\n",
    "        put the separator between its individual characters.\n",
    "        \"\"\"\n",
    "        with open(self.filename, 'a', encoding='utf-8') as f:\n",
    "            for item in info:\n",
    "                line = item if isinstance(item, str) else '|||'.join(item)\n",
    "                f.write(line + '\\n')\n",
    "\n",
    "    def crawl(self, url, pattern, headers=None):\n",
    "        \"\"\"Download ``url``, extract ``pattern`` matches and append them to the output file.\n",
    "\n",
    "        Args:\n",
    "            url: Page to fetch.\n",
    "            pattern: Regular expression handed to ``extract``.\n",
    "            headers: Optional extra headers. They are merged into\n",
    "                ``self.headers`` and therefore also apply to later requests.\n",
    "        \"\"\"\n",
    "        if headers:\n",
    "            self.headers.update(headers)\n",
    "        content = self.download(url)\n",
    "        info = self.extract(content, pattern)\n",
    "        self.save(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the tag index page and collect the text of every tag link.\n",
    "# The crawler is created here so this cell does not depend on a later cell\n",
    "# having been run first.\n",
    "douban_crawler = MyCrawler('douban.txt')\n",
    "\n",
    "url = 'https://book.douban.com/tag/?view=type'\n",
    "content = douban_crawler.download(url)\n",
    "tree = html.fromstring(content)\n",
    "tags = tree.xpath(\"//td/a/text()\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'%E5%B0%8F%E8%AF%B4'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the percent-encoded form of the first tag name.\n",
    "first_tag = tags[0]\n",
    "urllib.parse.quote(first_tag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current tag: 小说\n",
      "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T\n",
      "Last Start ID:  7600\n",
      "活着\n",
      "房思琪的初恋乐园\n",
      "白夜行\n",
      "解忧杂货店\n",
      "红楼梦\n",
      "追风筝的人\n",
      "百年孤独\n",
      "小王子\n",
      "围城\n",
      "平凡的世界（全三部）\n",
      "嫌疑人X的献身\n",
      "霍乱时期的爱情\n",
      "1984\n",
      "飘\n",
      "月亮与六便士\n",
      "三体: “地球往事”三部曲之一\n",
      "三体全集: 地球往事三部曲\n",
      "局外人\n",
      "杀死一只知更鸟\n",
      "骆驼祥子\n",
      "------------------------------------\n",
      "Current tag: 外国文学\n",
      "https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6?start=0&type=T\n",
      "Last Start ID:  7640\n",
      "小王子\n",
      "追风筝的人\n",
      "百年孤独\n",
      "飘\n",
      "1984\n",
      "霍乱时期的爱情\n",
      "月亮与六便士\n",
      "月亮和六便士\n",
      "杀死一只知更鸟\n",
      "傲慢与偏见\n",
      "局外人\n",
      "动物农场\n",
      "安徒生童话故事集\n",
      "简爱（英文全本）\n",
      "老人与海\n",
      "基督山伯爵\n",
      "哈利•波特\n",
      "一个陌生女人的来信\n",
      "牧羊少年奇幻之旅\n",
      "肖申克的救赎\n",
      "------------------------------------\n",
      "Current tag: 文学\n",
      "https://book.douban.com/tag/%E6%96%87%E5%AD%A6?start=0&type=T\n",
      "Last Start ID:  7640\n",
      "你当像鸟飞往你的山\n",
      "房思琪的初恋乐园\n",
      "小王子\n",
      "红楼梦\n",
      "百年孤独\n",
      "追风筝的人\n",
      "围城\n",
      "活着\n",
      "平凡的世界（全三部）\n",
      "解忧杂货店\n",
      "撒哈拉的故事\n",
      "霍乱时期的爱情\n",
      "月亮和六便士\n",
      "1984\n",
      "边城\n",
      "局外人\n",
      "许三观卖血记\n",
      "白鹿原: 20周年精装典藏版\n",
      "沉默的大多数: 王小波杂文随笔全编\n",
      "云边有个小卖部\n",
      "------------------------------------\n",
      "Current tag: 经典\n",
      "https://book.douban.com/tag/%E7%BB%8F%E5%85%B8?start=0&type=T\n",
      "Last Start ID:  7820\n",
      "活着\n",
      "小王子\n",
      "红楼梦\n",
      "百年孤独\n",
      "围城\n",
      "飘\n",
      "平凡的世界（全三部）\n",
      "三体全集: 地球往事三部曲\n",
      "骆驼祥子\n",
      "月亮与六便士\n",
      "哈利•波特\n",
      "杀死一只知更鸟\n",
      "霍乱时期的爱情\n",
      "傲慢与偏见\n",
      "1984\n",
      "追风筝的人\n",
      "边城\n",
      "安徒生童话故事集\n",
      "围城\n",
      "白鹿原: 20周年精装典藏版\n",
      "------------------------------------\n",
      "Current tag: 中国文学\n",
      "https://book.douban.com/tag/%E4%B8%AD%E5%9B%BD%E6%96%87%E5%AD%A6?start=0&type=T\n",
      "Last Start ID:  7720\n",
      "活着\n",
      "围城\n",
      "平凡的世界（全三部）\n",
      "骆驼祥子\n",
      "边城\n",
      "城南旧事: 纪念普及版\n",
      "明朝那些事儿（1-9）: 限量版\n",
      "撒哈拉的故事\n",
      "红楼梦\n",
      "白鹿原: 20周年精装典藏版\n",
      "许三观卖血记\n",
      "三体全集: 地球往事三部曲\n",
      "呐喊\n",
      "房思琪的初恋乐园\n",
      "平凡的世界\n",
      "围城\n",
      "沉默的大多数: 王小波杂文随笔全编\n",
      "许三观卖血记\n",
      "朝花夕拾\n",
      "人生海海\n",
      "------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import time\n",
    "import requests\n",
    "from lxml import html\n",
    "import urllib.parse\n",
    "\n",
    "douban_crawler = MyCrawler('douban.txt')\n",
    "\n",
    "# Number of result pages fetched per tag. 1 gives a quick single-page preview;\n",
    "# set to None to follow the paginator through to its last page.\n",
    "MAX_PAGES_PER_TAG = 1\n",
    "PAGE_SIZE = 20      # step of the `start` query parameter between result pages\n",
    "REQUEST_DELAY = 1   # seconds to pause between consecutive page requests\n",
    "\n",
    "\n",
    "def first_text(nodes, default='N/A'):\n",
    "    \"\"\"Return the stripped ``.text`` of the first node, or ``default`` when it is missing.\"\"\"\n",
    "    if nodes and nodes[0].text is not None:\n",
    "        return nodes[0].text.strip()\n",
    "    return default\n",
    "\n",
    "\n",
    "# One (name, url, publication info, intro) tuple per parsed book.\n",
    "book_records = []\n",
    "\n",
    "tag_list_url = 'https://book.douban.com/tag/?view=type'\n",
    "tag_content = douban_crawler.download(tag_list_url)\n",
    "tag_tree = html.fromstring(tag_content)\n",
    "tags = tag_tree.xpath(\"//td/a/text()\")\n",
    "for tag in tags[:5]:\n",
    "    print('Current tag:', tag)\n",
    "    quoted_tag = urllib.parse.quote(tag)\n",
    "    page_id = 1\n",
    "    last_start = 0\n",
    "    while True:\n",
    "        start_id = PAGE_SIZE * (page_id - 1)\n",
    "        url = 'https://book.douban.com/tag/{}?start={}&type=T'.format(quoted_tag, start_id)\n",
    "        print(url)\n",
    "        content = douban_crawler.download(url)\n",
    "        tree = html.fromstring(content)\n",
    "        if page_id == 1:\n",
    "            # The last paginator link is assumed to point at the final page;\n",
    "            # its `start` value is used as the stop offset.\n",
    "            page_links = tree.xpath(\"//div[@class='paginator']/a[last()]/@href\")\n",
    "            start_match = re.search(r'start=(\\d+)', page_links[0]) if page_links else None\n",
    "            if start_match:\n",
    "                last_start = int(start_match.group(1))\n",
    "                print('Last Start ID: ', last_start)\n",
    "        for book_info in tree.xpath(\"//li[@class='subject-item']\"):\n",
    "            title_links = book_info.xpath('.//h2/a')\n",
    "            if not title_links:\n",
    "                continue  # entry without a title link: nothing to record\n",
    "            book_name_elem = title_links[0]\n",
    "            # Drop newlines and runs of two or more whitespace characters from the link text.\n",
    "            book_name = re.sub(r'\\s{2,}', '', book_name_elem.text_content().replace('\\n', ''))\n",
    "            book_url = book_name_elem.get('href', '')\n",
    "            book_pub_info = first_text(book_info.xpath(\".//div[@class='pub']\"))\n",
    "            book_intro = first_text(book_info.xpath(\".//div[@class='info']/p\"))\n",
    "            book_records.append((book_name, book_url, book_pub_info, book_intro))\n",
    "            print(book_name)\n",
    "        out_of_pages = start_id >= last_start\n",
    "        hit_page_limit = MAX_PAGES_PER_TAG is not None and page_id >= MAX_PAGES_PER_TAG\n",
    "        if out_of_pages or hit_page_limit:\n",
    "            break\n",
    "        page_id += 1\n",
    "        time.sleep(REQUEST_DELAY)  # pause before requesting the next page\n",
    "    print('------------------------------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ten listing-page URLs for one tag, with start offsets 0, 20, ..., 180.\n",
    "url_template = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}&type=T'\n",
    "urls = [url_template.format(offset) for offset in range(0, 200, 20)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T',\n",
       " 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T' page is 54058 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T' page is 52984 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T' page is 52973 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T' page is 52753 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T' page is 52622 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T' page is 53638 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T' page is 52683 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T' page is 54098 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T' page is 53970 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T' page is 53460 bytes\n",
      "Wall time: 1.11 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import concurrent.futures\n",
    "import requests\n",
    "\n",
    "# URLS = ['http://www.163.com/',\n",
    "#         'http://www.sina.com.cn/',\n",
    "#         'http://baidu.com/',\n",
    "#         'http://youdao.com/',\n",
    "#         'http://bing.com/']\n",
    "\n",
    "douban_crawler = MyCrawler('douban.txt')\n",
    "\n",
    "\n",
    "def load_url(url):\n",
    "    \"\"\"Download ``url`` with the shared crawler and return the page text.\"\"\"\n",
    "    # Reading a module-level name needs no `global` declaration.\n",
    "    return douban_crawler.download(url)\n",
    "\n",
    "\n",
    "# The with statement waits for all submitted work and shuts the pool down on exit.\n",
    "with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:\n",
    "    # Submit every download and remember which URL each future belongs to.\n",
    "    future_to_url = {executor.submit(load_url, url): url for url in urls}\n",
    "    # as_completed yields futures as they finish, not in submission order.\n",
    "    for future in concurrent.futures.as_completed(future_to_url):\n",
    "        url = future_to_url[future]\n",
    "        try:\n",
    "            data = future.result()\n",
    "        except Exception as exc:\n",
    "            print('%r generated an exception: %s' % (url, exc))\n",
    "        else:\n",
    "            # `download` returns decoded text, so len() counts characters, not bytes.\n",
    "            print('%r page is %d characters' % (url, len(data)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T' page is 52753 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T' page is 52973 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T' page is 54058 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T' page is 52622 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T' page is 52984 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T' page is 52683 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T' page is 53638 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T' page is 54098 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T' page is 53460 bytes\n",
      "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T' page is 53970 bytes\n",
      "Wall time: 2.69 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import concurrent.futures\n",
    "\n",
    "# URLS = ['http://www.163.com/',\n",
    "#         'http://www.sina.com.cn/',\n",
    "#         'http://baidu.com/',\n",
    "#         'http://youdao.com/',\n",
    "#         'http://bing.com/']\n",
    "\n",
    "# Fetch the same URLs one after another; the cell's %%time magic reports the total.\n",
    "for url in urls:\n",
    "    data = douban_crawler.download(url)\n",
    "    # `download` returns decoded text, so len() counts characters, not bytes.\n",
    "    print('%r page is %d characters' % (url, len(data)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
